Frequent Itemset Exploration (Item = template)

In this notebook we explore the results of a frequent itemset computation on the encoded notebooks. We encode the notebooks using the top-down and bottom-up methods we have been working with, gather the resulting buckets, and run the FP-growth algorithm to identify common patterns.

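For intuition about what the frequent itemset step consumes, here is a minimal sketch of the bucket/item framing, assuming mlxtend's fpgrowth and made-up template ids (nbminer's own FrequentItemsets wrapper, used in the cells below, may expose a different interface): each bucket is the set of template ids a cell was encoded to, and FP-growth returns the template combinations that co-occur in at least min_support of the buckets.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

# Hypothetical buckets: each cell reduced to the set of template ids it contains
buckets = [
    ['template_1', 'template_2', 'template_5'],
    ['template_1', 'template_2'],
    ['template_2', 'template_5'],
    ['template_1', 'template_2', 'template_5'],
]

# One-hot encode the buckets and mine itemsets appearing in >= 50% of them
te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(buckets).transform(buckets), columns=te.columns_)
print(fpgrowth(onehot, min_support=0.5, use_colnames=True))
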
Bucket Size

It is important to look at some statistics about bucket size when cells are used as buckets.


In [70]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
from nbminer.stats.summarize_corpus import SummarizeCorpus

people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks[:5]]

a = Features(notebook_objs)
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20, 5)
fig, axes = plt.subplots(1,2)
axes[0].hist(SummarizeCorpus(a).get_cell_sizes(), bins=25)
axes[0].set_xlabel('Lines of Code')
axes[0].set_ylabel('Number of Occurrences')
axes[1].hist(SummarizeCorpus(a).get_top_level(), bins=25)
axes[1].set_xlabel('AST Top Level Nodes')
axes[1].set_ylabel('Number of Occurrences')


Out[70]:
<matplotlib.text.Text at 0x1a63af3be0>

Part 1

Template Generation: Bottom Up

Buckets: Cells


In [9]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *

In [10]:
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets

a = Features(notebook_objs)
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
agr = ASTGraphReducer(a, threshold=8, split_call=True)
fi = FrequentItemsets()
pipe = Pipeline([rbn, agr, gi, fi])
a = pipe.transform(a)

fi_bu_cells = fi
agr_cells = agr


<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151f4a52b0>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x151f9a09b0>
<nbminer.preprocess.get_imports.GetImports object at 0x15118f01d0>
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x151f9a0978>
7563
0.07852983474731445

In [12]:
print("Number of Cells: \n", fi_bu_cells.get_number_buckets())
print("Number of Itemsets: \n", fi_bu_cells.get_number_itemsets(min_pattern=2))
print("Percentage of Cells with Itemsets: \n", fi_bu_cells.get_percentage(min_pattern=2))
print("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", fi_bu_cells.get_avg_number(min_pattern=2))


Number of Cells: 
 7563
Number of Itemsets: 
 748
Percentage of Cells with Itemsets: 
 0.6357265635329895
Average number of Itemsets per Cell (Excluding cells without itemsets):
 3.973687690070078

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(fi_bu_cells.get_pattern_lengths())


Out[13]:
(array([ 261.,  184.,  154.,  150.,    0.,  130.,   84.,   36.,    9.,    1.]),
 array([ 1. ,  1.8,  2.6,  3.4,  4.2,  5. ,  5.8,  6.6,  7.4,  8.2,  9. ]),
 <a list of 10 Patch objects>)

In [25]:
import operator
def get_bar_plot_info(functions):
    """Sort a {function_name: count} dict by count and return the bar-plot
    inputs: x positions, y counts, and the function names for the tick labels."""
    sorted_functions = sorted(functions.items(), key=operator.itemgetter(1))
    x, y, x_ticks = [], [], []
    for i, (name, count) in enumerate(sorted_functions, start=1):
        x.append(i)
        y.append(count)
        x_ticks.append(name)
    return x, y, x_ticks

In [15]:
# 1, 0 - label encoder
# 4, 0 - subplots
# 2, 2 - load

pattern = fi_bu_cells.get_patterns(2)[2]
functions = fi_bu_cells.get_function_dict(pattern)
all_functions = fi_bu_cells.get_full_function_dict(pattern)
maxim = fi_bu_cells.get_number_matches(pattern)

plt.rcParams['figure.figsize'] = (20, 10)
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)

fig, axes = plt.subplots(2)
axes[0].bar(x, y, align='center', tick_label=x_ticks)
axes[0].axhline(maxim)
axes[1].bar(x2, y2, align='center', tick_label=x_ticks2)
axes[1].axhline(maxim)


Out[15]:
<matplotlib.lines.Line2D at 0x15125f4e48>

In [16]:
import astor
l = pattern #fi_bu_cells.get_patterns(1)[0]
for el in l:
    if el == '':
        continue
    print (astor.to_source(agr_cells.templates.get_random_example(el)))
print(fi_bu_cells.print_itemset_examples(l, 2))


var.legend((var[0], var[0]), ('retweets', 'favorites'))

var = 0.35

Template: ('template_3213', 'template_91')
EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = var['retweet_count'].sum()
var = var['favorite_count'].sum()
var = var['retweet_count'].sum()
var = var['favorite_count'].sum()
var = 2
var = np.arange(var)
var = 0.35
var, var = plt.subplots()
var = var.bar(var, (var, var), var, color='r')
var = var.bar(var + var, (var, var), var, color='y')
var.set_ylabel('Scores')
var.set_title('EPFL vs ETH retweet and favorite')
var.set_xticks(var + var / 2)
var.set_xticklabels(('EPFL', 'ETH'))
var.legend((var[0], var[0]), ('retweets', 'favorites'))
def autolabel(rects):
    """
    Attach a text label above each bar displaying its height
    """
    for var in var:
        var = var.get_height()
        var.text(var.get_x() + var.get_width() / 2.0, 1.05 * var, '%d' %
            int(var), ha='center', va='bottom')
var(var)
var(var)
plt.show()
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = 2
var = np.arange(var)
var = 0.35
var, var = plt.subplots()
var = var.bar(var, (sum(var), sum(var)), var, color='r')
var = var.bar(var + var, (sum(var), sum(var)), var, color='y')
var.set_ylabel('Scores')
var.set_title('EPFL vs ETH relevance')
var.set_xticks(var + var / 2)
var.set_xticklabels(('EPFL', 'ETH'))
var.legend((var[0], var[0]), ('retweets', 'favorites'))
def autolabel(rects):
    """
    Attach a text label above each bar displaying its height
    """
    for var in var:
        var = var.get_height()
        var.text(var.get_x() + var.get_width() / 2.0, 1.05 * var, '%d' %
            int(var), ha='center', va='bottom')
var(var)
var(var)
plt.show()
****************************************************************************************************





In [27]:
# 1, 0 - label encoder
# 4, 0 - subplots
# 2, 2 - load

patterns = fi_bu_cells.get_patterns(1)
functions = {}
for pattern in patterns:
    functions = fi_bu_cells.get_function_dict(pattern, functions)
all_functions = {}
for pattern in patterns:
    all_functions = fi_bu_cells.get_full_function_dict(pattern, all_functions)

plt.rcParams['figure.figsize'] = (20, 10)
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)

plt.bar(x, y, align='center', tick_label=x_ticks)
for i in range(len(x)):
    print(x_ticks[i], y[i])


LabelEncoder 10
yscale 10
PorterStemmer 10
use 11
RegexpTokenizer 11
boxplot 12
countplot 13
unique 13
fillna 18
dict 22
display 22
open 22
score 22
pie 24
dropna 26
set_xticklabels 26
Series 26
set_index 27
list 28
WordNetLemmatizer 28
add_subplot 29
range 30
bar 31
join 31
isnull 34
agg 36
format 40
concat 41
words 45
value_counts 47
suptitle 48
LinearRegression 56
load 62
tolist 63
LdaModel 68
generate 72
WordCloud 72
print_topics 75
hist 80
sample 82
RandomForestRegressor 87
reset_index 94
set 95
mean 108
DataFrame 110
tight_layout 111
Patch 120
len 122
train_test_split 125
copy 133
cross_val_score 134
predict 152
Dictionary 174
count 192
axis 195
set_context 206
doc2bow 236
imshow 236
describe 271
read_json 280
legend 282
apply 310
arange 328
figure 329
fit 341
subplot 504
sum 536
drop 562
groupby 718
plot 939
magic 975
xlabel 1044
print 1096
head 1128
var 1129
ylabel 1416
title 1653
show 1918
yticks 3072
setp 3072
subplots 3182
barplot 3251
set_title 3466
set_ylabel 3573
set_xlabel 3608
xticks 6190

Part 2

Template Generation: Top Down

Buckets: Cells


In [10]:
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets

a = Features(notebook_objs)
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 500)
fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, fe, ke, fi])
a = pipe.transform(a)

fi_td_cells = fi
ke_cells = ke


<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1542464e10>
<nbminer.preprocess.get_imports.GetImports object at 0x10758f908>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x15399761d0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x15391609e8>
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x15396ad518>
7563
0.08916783332824707

In [12]:
print("Number of Cells: \n", fi_td_cells.get_number_buckets())
print("Number of Itemsets: \n", fi_td_cells.get_number_itemsets(min_pattern=2))
print("Percentage of Cells with Itemsets: \n", fi_td_cells.get_percentage(min_pattern=2))
print("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", fi_td_cells.get_avg_number(min_pattern=2))


Number of Cells: 
 7563
Number of Itemsets: 
 669
Percentage of Cells with Itemsets: 
 0.26669311119925954
Average number of Itemsets per Cell (Excluding cells without itemsets):
 3.2371933621933624

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(fi_td_cells.get_pattern_lengths())


Out[13]:
(array([ 122.,  241.,  199.,    0.,  128.,   64.,    0.,   28.,    8.,    1.]),
 array([ 1. ,  1.7,  2.4,  3.1,  3.8,  4.5,  5.2,  5.9,  6.6,  7.3,  8. ]),
 <a list of 10 Patch objects>)

In [80]:
#1, 0
#1, 2 - Read, open
pattern = fi_td_cells.get_patterns(1)[2]
functions = fi_td_cells.get_function_dict(pattern)
all_functions = fi_td_cells.get_full_function_dict(pattern)
maxim = fi_td_cells.get_number_matches(pattern)

plt.rcParams['figure.figsize'] = (20, 10)
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)

fig, axes = plt.subplots(2)
axes[0].bar(x, y, align='center', tick_label=x_ticks)
axes[0].axhline(maxim)
axes[1].bar(x2, y2, align='center', tick_label=x_ticks2)
axes[1].axhline(maxim)


Out[80]:
<matplotlib.lines.Line2D at 0x154fb28c88>

In [81]:
import astor
l = pattern #fi_td_cells.get_patterns(1)[0]
for el in l:
    if el == '':
        continue
    print (astor.to_source(ke_cells.templates.get_random_example(el)))
print(fi_td_cells.print_itemset_examples(l, 4))


with open('eth_en.json', encoding='utf8') as var:
    var = json.loads('[{}]'.format(var.read()))

Template: ('template_305',)
EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = 'epfl_en.json'
var = 'eth_en.json'
var = open(var).read()
var = json.loads(var)
var = open(var).read()
var = json.loads(var)
var = len(var)
var = len(var)
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
def countWords(epfl_text):
    var.to_csv('epfl_text.csv')
    with open('epfl_text.csv') as var:
        var = Counter(var.read().split())
    return var
def removeStopwords(wordcount):
    for var in var:
        if var[var] > 0:
            var.pop(var, None)
    return var
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
with open('epfl_en.json', 'r') as var:
    var = json.loads(var.read())
with open('eth_en.json', 'r') as var:
    var = json.loads(var.read())
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var['text'].to_csv('text.csv', encoding='utf-8')
var = open('text.csv', 'r', encoding='utf-8').read()
****************************************************************************************************





Part 3

Template Generation: Bottom Up

Buckets: 4-Grams


In [23]:
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [24]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.freq_itemsets.frequent_gram_itemsets import FrequentGramItemsets
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer

a = Features(notebook_objs)
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
agr = ASTGraphReducer(a, threshold=8, split_call=True)
fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, agr, fi])
a = pipe.transform(a)

agr_4gram = agr
fi_bu_4gram = fi


<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a756d2f98>
<nbminer.preprocess.get_imports.GetImports object at 0x1548377160>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x1a6b5a9fd0>
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x1a6b5a9630>
7563
0.08259081840515137

In [25]:
print("Number of Cells: \n", fi_bu_4gram.get_number_buckets())
print("Number of Itemsets: \n", fi_bu_4gram.get_number_itemsets(min_pattern=2))
print("Percentage of Cells with Itemsets: \n", fi_bu_4gram.get_percentage(min_pattern=2))
print("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", fi_bu_4gram.get_avg_number(min_pattern=2))


Number of Cells: 
 7563
Number of Itemsets: 
 750
Percentage of Cells with Itemsets: 
 0.682665608885363
Average number of Itemsets per Cell (Excluding cells without itemsets):
 4.078804707126801

In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(fi_bu_4gram.get_pattern_lengths())


Out[26]:
(array([ 270.,  186.,  154.,  150.,    0.,  130.,   84.,   36.,    9.,    1.]),
 array([ 1. ,  1.8,  2.6,  3.4,  4.2,  5. ,  5.8,  6.6,  7.4,  8.2,  9. ]),
 <a list of 10 Patch objects>)

In [27]:
pattern = fi_bu_4gram.get_patterns()[0]
functions = fi_bu_4gram.get_function_dict(pattern)
all_functions = fi_bu_4gram.get_full_function_dict(pattern)
maxim = fi_bu_4gram.get_number_matches(pattern)

plt.rcParams['figure.figsize'] = (20, 10)
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)

fig, axes = plt.subplots(2)
axes[0].bar(x, y, align='center', tick_label=x_ticks)
axes[0].axhline(maxim)
axes[1].bar(x2, y2, align='center', tick_label=x_ticks2)
axes[1].axhline(maxim)


Out[27]:
<matplotlib.lines.Line2D at 0x1548648780>

In [28]:
import astor
l = fi_bu_4gram.get_patterns(1)[0]
for el in l:
    if el == '':
        continue
    print (astor.to_source(agr_4gram.templates.get_random_example(el)))
print(fi_bu_4gram.print_itemset_examples(l, 2))


var = preprocessing.LabelEncoder()

Template: ('template_1580',)
EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var['in_reply_to_screen_name'] = var['in_reply_to_screen_name'].fillna(
    'Unknown')
var['in_reply_to_screen_name'] = var['in_reply_to_screen_name'].fillna(
    'Unknown')
var['month'] = var['month'].fillna('Unknown')
var['month'] = var['month'].fillna('Unknown')
var = preprocessing.LabelEncoder()
var.fit(var['in_reply_to_screen_name'])
var['in_reply_to_screen_name'] = var.transform(var['in_reply_to_screen_name'])
var.fit(var['month'])
var['month'] = var.transform(var['month'])
var.fit(var['in_reply_to_screen_name'])
var['in_reply_to_screen_name'] = var.transform(var['in_reply_to_screen_name'])
var.fit(var['month'])
var['month'] = var.transform(var['month'])
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = preprocessing.LabelEncoder()
var['lang'] = var.fit_transform(var['lang'])
var.head()
****************************************************************************************************





Part 4

Template Generation: Top Down

Buckets: N-Gram


In [29]:
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [30]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.freq_itemsets.frequent_gram_itemsets import FrequentGramItemsets
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder

a = Features(notebook_objs)
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 500)
fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, fe, ke, fi])
a = pipe.transform(a)

ke_4gram = ke
fi_td_4gram = fi


<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1522a66a58>
<nbminer.preprocess.get_imports.GetImports object at 0x1a61d9ce80>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x154341d828>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x154340c0f0>
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x15399761d0>
7563
0.09662890434265137

In [31]:
print("Number of Cells: \n", fi_td_4gram.get_number_buckets())
print("Number of Itemsets: \n", fi_td_4gram.get_number_itemsets(min_pattern=2))
print("Percentage of Cells with Itemsets: \n", fi_td_4gram.get_percentage(min_pattern=2))
print("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", fi_td_4gram.get_avg_number(min_pattern=2))


Number of Cells: 
 7563
Number of Itemsets: 
 693
Percentage of Cells with Itemsets: 
 0.26708977918815285
Average number of Itemsets per Cell (Excluding cells without itemsets):
 3.2764843890994406

In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(fi_td_4gram.get_pattern_lengths())


Out[32]:
(array([ 121.,  241.,  207.,    0.,  138.,   69.,    0.,   29.,    8.,    1.]),
 array([ 1. ,  1.7,  2.4,  3.1,  3.8,  4.5,  5.2,  5.9,  6.6,  7.3,  8. ]),
 <a list of 10 Patch objects>)

In [33]:
pattern = fi_td_4gram.get_patterns(1)[0]
functions = fi_td_4gram.get_function_dict(pattern)
all_functions = fi_td_4gram.get_full_function_dict(pattern)
maxim = fi_td_4gram.get_number_matches(pattern)

plt.rcParams['figure.figsize'] = (20, 10)
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)

fig, axes = plt.subplots(2)
axes[0].bar(x, y, align='center', tick_label=x_ticks)
axes[0].axhline(maxim)
axes[1].bar(x2, y2, align='center', tick_label=x_ticks2)
axes[1].axhline(maxim)


Out[33]:
<matplotlib.lines.Line2D at 0x15401afda0>

In [66]:
import astor
l = fi_td_4gram.get_patterns(1)[0]
for el in l:
    if el == '':
        continue
    print (astor.to_source(ke_4gram.templates.get_random_example(el)))
print(fi_td_4gram.print_itemset_examples(l, 10))


var = np.array(var.retweet_count)

Template: ('template_483',)
EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var)
var
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = var(var, var)
var = np.array(var)
var
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var.get_feature_names())
var
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var['Year^2'] = np.square(np.array(var['Year']))
var['Year^3'] = np.square(np.array(var['Year'])) * np.array(var['Year'])
var['Month^2'] = np.square(np.array(var['Month']))
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var.retweet_count)
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var.retweet_count)
var = np.array(var.drop('retweet_count', axis=1))
var = RandomForestRegressor()
var.fit(var, var)
var = var.predict(var)
var = plt.subplot()
plt.plot(var, var, 'o')
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var.retweet_count)
var = np.array(var.drop('retweet_count', axis=1))
var = RandomForestRegressor()
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = var[['favorite_count', 'year', 'month', 'hour', 'weekday',
    'retweet_count']]
var = np.array(var[['favorite_count', 'year', 'month', 'hour', 'weekday']])
var = np.array(var['retweet_count'])
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
var = np.array(var['retweet_count'])
var = LabelEncoder()
var = var.fit_transform(var)
var
****************************************************************************************************



EXAMPLE CELL CODE FOR THIS TEMPLATE
****************************************************************************************************
def preprocess_regressor(df):
    var = var[['favorite_count', 'favorited', 'is_quote_status', 'lang',
        'truncated', 'year', 'month', 'hour', 'retweet_count']]
    var.lang = var.lang == 'en'
    var = var[var]
    var = var[var.retweet_count < 260]
    var = np.array(var)
    var
    return np.array(var)
****************************************************************************************************




